from __future__ import division
from operator import itemgetter, attrgetter

import gc
import math
import matplotlib
import os
import pylab
import random
import sys
import time
from sets import Set
from scipy import stats
import numpy as np

# Posting key -> number of lines in the popped-postings input file whose first
# field is that key.
top10PostingDict = dict()

# Number of "<qID>_<docID>" results for which every required posting key is
# present in top10PostingDict.
numOfDocumentResultRetained = 0
# inputFileName2 = "/home/diaosi/web-search-engine-wei/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_1_20140211Night_DEBUG_1%"
# inputFileName2 = "/home/diaosi/web-search-engine-wei/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140217Night_DEBUG_1%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140217Night_DEBUG_1%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140217Night_DEBUG_3%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140217Night_DEBUG_5%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_10%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_15%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_20%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_30%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_40%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_5_20140218Morning_DEBUG_50%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140218Afternoon_DEBUG_1%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140218Afternoon_DEBUG_3%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140218Afternoon_DEBUG_5%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_10_20140218Afternoon_DEBUG_10%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_1%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_3%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_5%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_10%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_15%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_20%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_30%"
# inputFileName2 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_40%"
# Currently selected popped-postings input file. Adjacent string literals are
# concatenated by the parser, so the value is one unbroken path.
inputFileName2 = (
    "/data/obukai/workspace_USE_SINCE_20140217Night/"
    "web-search-engine-wei-2014-March/polyIRIndexer/"
    "importantPostingsBeingPoppedAtDifferentLevels_weight_0_20140219Afternoon_DEBUG_50%"
)



print "inputFileName2:",inputFileName2
inputFileHandler = open(inputFileName2,"r")
# ignore the headline
inputFileHandler.readline()
currentLine = inputFileHandler.readline()
while currentLine:
    top10PostingKey = currentLine.strip().split(" ")[0]
    if top10PostingKey not in top10PostingDict:
        top10PostingDict[top10PostingKey] = 1
    else:
        top10PostingDict[top10PostingKey] += 1
    currentLine = inputFileHandler.readline()
print "len(top10PostingDict): ",len(top10PostingDict)
inputFileHandler.close()

documentResultDict = {}
inputFileName1 = "/data/obukai/workspace_USE_SINCE_20140217Night/web-search-engine-wei-2014-March/polyIRIndexer/tail5KResults_sortedByQID_NEW_FORMAT_20140210Afternoon"
inputFileHandler = open(inputFileName1,"r")
currentLine = inputFileHandler.readline()
while currentLine:
    currentLineElements = currentLine.strip().split(" ")
    qIDInStringFormat = currentLineElements[0]
    docIDInStringFormat = currentLineElements[1]
    termIDInStringFormat = currentLineElements[2]
    documentResultKey = qIDInStringFormat + "_" + docIDInStringFormat 
    if documentResultKey not in documentResultDict:
        documentResultDict[documentResultKey] = []
        documentResultDict[documentResultKey].append(termIDInStringFormat)
    else:
        documentResultDict[documentResultKey].append(termIDInStringFormat)
    currentLine = inputFileHandler.readline()
print "len(documentResultDict): ",len(documentResultDict)
inputFileHandler.close()

for documentResultKey in documentResultDict:
    docID = documentResultKey.strip().split("_")[1]
    currentDocumentResultRetainedFlag = True
    for termID in documentResultDict[documentResultKey]:
        top10PostingKey = termID + "_" + docID 
        if top10PostingKey not in top10PostingDict:
            currentDocumentResultRetainedFlag = False
    if not currentDocumentResultRetainedFlag:
        pass
    else:
        numOfDocumentResultRetained += 1
print "numOfDocumentResultRetained: ",numOfDocumentResultRetained
exit(1)


